In [8]:
import pandas as pd
import numpy as np
In [9]:
from sklearn.datasets import load_boston
In [10]:
# Load the Boston housing dataset.
# FIX: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2
# (ethical concerns about the engineered "B" feature). On new versions the
# import cell above fails, so fall back to rebuilding the dataset from the
# original CMU source, exactly as the scikit-learn deprecation notice suggests.
try:
    bunch = load_boston()
except (NameError, ImportError):
    from types import SimpleNamespace

    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+", skiprows=22, header=None,
    )
    # Each record is spread over two physical rows: 11 values + 3 values.
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    feature_names = np.array(
        ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
         "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
    )
    # Mimic the attributes of the old sklearn Bunch used below
    # (.data, .target, .feature_names, .DESCR).
    bunch = SimpleNamespace(
        data=data,
        target=target,
        feature_names=feature_names,
        DESCR="Boston house prices dataset (fetched from lib.stat.cmu.edu)",
    )
In [11]:
# Show the dataset description (provenance and feature glossary).
description = bunch.DESCR
print(description)
In [12]:
# Build the feature matrix with string column labels and pull out the target.
column_labels = [str(name) for name in bunch.feature_names]
X = pd.DataFrame(data=bunch.data, columns=column_labels)
y = bunch.target
In [13]:
# Preview the first five rows of the feature matrix.
X.head(n=5)
Out[13]:
Зафиксируем генератор случайных чисел для воспроизводимости:
In [14]:
SEED = 22
# BUG FIX: the original did `np.random.seed = SEED`, which *replaces* the
# seed function with the integer 22 — the generator was never seeded and
# np.random.seed became uncallable. Call the function instead.
np.random.seed(SEED)
Разделим данные на условно обучающую и отложенную выборки:
In [15]:
from sklearn.model_selection import train_test_split
In [16]:
# Hold out 20% of the data; SEED makes the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_test, y_train, y_test = split
In [17]:
# Sanity check: sample/feature counts of the four split parts.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))
Out[17]:
Измерять качество будем с помощью метрики среднеквадратичной ошибки:
In [18]:
from sklearn.metrics import mean_squared_error
In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Baseline: ordinary least squares regression.
# NOTE(review): cross_val_score is run on the held-out *test* split, which is
# unusual (CV normally runs on the training data) — kept as-is to stay
# comparable with the cells below; confirm this is intended.
linreg = LinearRegression()  # renamed from 'clf': this is a regressor, not a classifier
# FIX: the original's explicit fit was dead work — cross_val_score clones and
# refits the estimator on each fold, so the pre-fitted state is never used.
print('Вышла средняя ошибка, равная %5.4f' % \
(-np.mean(cross_val_score(linreg, X_test, y_test, cv=5, scoring='neg_mean_squared_error'))))
In [41]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

# Scale features and target for SGD (gradient descent is scale-sensitive).
# BUG FIX: the original reused one StandardScaler for both X and y — the second
# fit_transform overwrote the X statistics — and passed the 1-D y_train directly,
# which raises ValueError (StandardScaler requires a 2-D array). Use a separate
# scaler per object and reshape/ravel the target.
ss = StandardScaler()
X_scaled = ss.fit_transform(X_train)
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()
sgd = SGDRegressor()
sgd.fit(X_scaled, y_scaled);
# NOTE(review): this MSE is in *scaled* target units, so it is not directly
# comparable with the unscaled MSE printed by the other cells.
print('Вышла средняя ошибка, равная %5.4f' % \
(-np.mean(cross_val_score(sgd, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error'))))
In [61]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV

############ Ridge
# Shared hyperparameter grid: alpha in {0.01, 0.1, 1, 10, 100}.
# (Also reused by the Lasso and ElasticNet cells below.)
params = {
'alpha': [10**x for x in range(-2,3)]
}
from sklearn.linear_model import Ridge
# FIX: RidgeCV() was left on its default alphas (0.1, 1.0, 10.0), so Ridge was
# tuned over a *different* grid than Lasso/ElasticNet and the three-way
# comparison was unfair. Pass the shared grid explicitly.
gsR = RidgeCV(alphas=params['alpha'])
gsR.fit(X_train, y_train);
print('Вышла средняя ошибка, равная %5.4f' % \
(-np.mean(cross_val_score(gsR, X_test, y_test, cv=5, scoring='neg_mean_squared_error'))))
In [63]:
############ Lasso
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
# Tune alpha over the shared grid by exhaustive search.
# (LassoCV would also work but was found slower here.)
gsL = GridSearchCV(Lasso(), param_grid=params)
gsL.fit(X_train, y_train)
scores = cross_val_score(gsL, X_test, y_test, cv=5,
                         scoring='neg_mean_squared_error')
print('Вышла средняя ошибка, равная %5.4f' % (-np.mean(scores)))
In [59]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
# Same protocol as the Lasso cell: grid-search alpha, then 5-fold CV scoring.
# (ElasticNetCV is a drop-in alternative, but was not accurate enough here.)
gsE = GridSearchCV(ElasticNet(), param_grid=params)
gsE.fit(X_train, y_train)
scores = cross_val_score(gsE, X_test, y_test, cv=5,
                         scoring='neg_mean_squared_error')
print('Вышла средняя ошибка, равная %5.4f' % (-np.mean(scores)))
Итого, самый точный среди этих трёх — GridSearchCV + Lasso.
Oops! Все случаи уже были рассмотрены для cross_val_score